{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Using TensorFlow backend.\n"
     ]
    }
   ],
   "source": [
    "# https://machinelearningmastery.com/multi-class-classification-tutorial-keras-deep-learning-library/\n",
    "import numpy\n",
    "import pandas\n",
    "import math\n",
    "from keras.models import Sequential\n",
    "from keras.layers import Dense\n",
    "from keras.wrappers.scikit_learn import KerasClassifier\n",
    "from keras.utils import np_utils\n",
    "from sklearn.model_selection import cross_val_score\n",
    "from sklearn.model_selection import cross_val_predict\n",
    "from sklearn.model_selection import KFold\n",
    "from sklearn.preprocessing import LabelEncoder\n",
    "from sklearn.pipeline import Pipeline\n",
    "from sklearn.metrics import confusion_matrix\n",
    "from sklearn.metrics import matthews_corrcoef"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Initialize Random Number Generator\n",
    "# fix random seed for reproducibility\n",
    "seed = 7\n",
    "numpy.random.seed(seed)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# load dataset\n",
    "# dataframe = pandas.read_csv(\"kddforpandatrain.csv\")#, header=True)\n",
    "dataframe = pandas.read_csv(\"kdd_dataset.csv\")#, header=True)  # read the whole 10% dataset into dataframe\n",
    "\n",
    "# samples 3000 random data points from 500k\n",
    "dataframe = dataframe.sample(n=300)\n",
    "\n",
    "# LabelEncoder, turns all our categorical data into integers\n",
    "le = LabelEncoder()\n",
    "\n",
    "# apply \"le.fit_transform\" to every column (usually only works on 1 column)\n",
    "dataframe_encoded = dataframe.apply(le.fit_transform)\n",
    "attack_labels = le.classes_\n",
    "dataset = dataframe_encoded.values\n",
    "\n",
    "#Set X as our input data and Y as our label\n",
    "X = dataset[:,0:41].astype(float)\n",
    "Y = dataset[:,41]\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "6\n"
     ]
    }
   ],
   "source": [
    "# encode class values as integers\n",
    "encoder = LabelEncoder()\n",
    "encoder.fit(Y)\n",
    "encoded_Y = encoder.transform(Y)\n",
    "# convert integers to dummy variables (i.e. one hot encoded)\n",
    "dummy_y = np_utils.to_categorical(encoded_Y)\n",
    "# print(dummy_y)\n",
    "print(len(dummy_y[0]))\n",
    "num_of_classes = len(dummy_y[0])  # the length of dummy y is the number of classes we have in our small sample\n",
    "# since we are randomly sampling from a large dataset, we might not get 1 of every class in our sample\n",
    "# we need to set output layer to be equal to the length of our dummy_y vectors\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# define baseline model\n",
    "def baseline_model():\n",
    "    # create model\n",
    "    model = Sequential()\n",
    "    \n",
    "    inputs = 41\n",
    "    hidden_layer1 = 18\n",
    "    hidden_layer2 = 6\n",
    "    hidden_layer3 = 0\n",
    "    outputs = num_of_classes  #needs to be this variable in case we forget to sample. Could end up having 10 classes or 12, etc\n",
    "    \n",
    "    model.add(Dense(hidden_layer1, input_dim=inputs, activation='relu'))\n",
    "    if hidden_layer2 != 0:\n",
    "        model.add(Dense(hidden_layer2, activation='relu'))\n",
    "    if hidden_layer3 != 0:\n",
    "        model.add(Dense(hidden_layer3, activation='relu'))\n",
    "    model.add(Dense(outputs, activation='softmax'))\n",
    "    \n",
    "    # Compile model\n",
    "    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])\n",
    "    return model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[ 67   0   0   0   0   0]\n",
      " [  0  71   1   0   0   0]\n",
      " [  0   2   0   0   0   0]\n",
      " [  0   1   0 156   0   0]\n",
      " [  0   1   0   0   0   0]\n",
      " [  0   1   0   0   0   0]]\n",
      "total: 300\n",
      "accuracy: 0.98\n",
      "Matthews correlation coefficient: 0.9677710189539196\n",
      "Baseline: 89.33% (21.95%)\n"
     ]
    }
   ],
   "source": [
    "\n",
    "estimator = KerasClassifier(build_fn=baseline_model, epochs=200, batch_size=5, verbose=0)\n",
    "\n",
    "\"\"\"\n",
    "trained_classifier = estimator.fit(X, Y)\n",
    "print(type(estimator))\n",
    "# Predicting the Test set results\n",
    "y_pred = estimator.predict(X)\n",
    "\"\"\"\n",
    "\n",
    "\n",
    "\n",
    "kfold = KFold(n_splits=10, shuffle=True, random_state=seed)\n",
    "y_pred = cross_val_predict(estimator, X, dummy_y, cv=kfold)\n",
    "results = cross_val_score(estimator, X, dummy_y, cv=kfold)\n",
    "\n",
    "cm = confusion_matrix(Y, y_pred)\n",
    "print(cm)\n",
    "print(\"total: \" + str(cm.sum()))\n",
    "print(\"accuracy: \" + str(numpy.trace(cm) / cm.sum()))\n",
    "print(\"Matthews correlation coefficient: \" + str(matthews_corrcoef(Y, y_pred)))\n",
    "\n",
    "\n",
    "\n",
    "print(\"Baseline: %.2f%% (%.2f%%)\" % (results.mean()*100, results.std()*100))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Field          TP             FP             FN             TN             MC Rate        MCC            \n",
      "---------------------------------------------------------------------------------------------------\n",
      "\n",
      "neptune.       0.22333        0.00000        0.00000        0.77667        0.00000        1.00000        \n",
      "normal.        0.23667        0.06579        1.00000        0.74333        0.07792        0.15162        \n",
      "portsweep.     0.00000        1.00000        1.00000        0.99000        1.00000        -0.50251       \n",
      "smurf.         0.52000        0.00000        1.00000        0.47667        0.00637        0.33231        \n",
      "teardrop.      0.00000        0.00000        1.00000        0.99667        1.00000        0.00000        \n",
      "warezclient.   0.00000        0.00000        1.00000        0.99667        1.00000        0.00000        \n",
      "\n",
      "Average true positive rate: 0.16333333333333333\n",
      "Average false positive rate: 0.17763157894736845\n",
      "Average false negative rate: 0.8333333333333334\n",
      "Average true negative rate: 0.83\n",
      "Average Misclassification Rate: 0.5140485841122784\n",
      "Matthews Correlation Coefficient: -0.01650955891996516\n"
     ]
    }
   ],
   "source": [
    "def true_positive_rate(cm, i, total):\n",
    "    return cm[i][i]/total\n",
    "    \n",
    "def false_positive_rate(cm,j):\n",
    "    fp_rate = 0\n",
    "    for i in range(0,len(cm)):\n",
    "        if (i != j):\n",
    "            fp_rate += cm[i][j]\n",
    "    if ((fp_rate + cm[j][j]) != 0):\n",
    "        return fp_rate/(fp_rate + cm[j][j])\n",
    "    else:\n",
    "        return 0;\n",
    "        \n",
    "def false_negative_rate(cm, i):\n",
    "    fn_rate = 0;\n",
    "    for j in range(0,len(cm)):\n",
    "        if (i != j):\n",
    "            fn_rate += cm[i][j]\n",
    "    if ((fn_rate + cm[j][j]) != 0):\n",
    "        return fn_rate/(fn_rate + cm[j][j])\n",
    "    else:\n",
    "        return 0;\n",
    "\n",
    "def true_negative_rate(cm,i,total):\n",
    "    tn_rate = 0\n",
    "    for j in range(0,len(cm)):\n",
    "        for k in range(0,len(cm)):\n",
    "            if (j != i and k != i):\n",
    "                tn_rate += cm[j][k]\n",
    "    return tn_rate/total\n",
    "\n",
    "def misclassification_rate(cm,l):\n",
    "    fp_rate = 0\n",
    "    fn_rate = 0\n",
    "    for i in range(0,len(cm)):\n",
    "        if (i != l):\n",
    "            fp_rate += cm[i][l]\n",
    "    for j in range(0,len(cm)):\n",
    "        if (l != j):\n",
    "            fn_rate += cm[l][j]\n",
    "    return (fp_rate + fn_rate)/(fp_rate + fn_rate + cm[l][l])\n",
    "    \n",
    "def avg_true_positive_rate(cm):\n",
    "    tp_rate = 0\n",
    "    for i in range(0,len(cm)):\n",
    "        tp_rate += true_positive_rate(cm,i,cm.sum())\n",
    "    return tp_rate/len(cm)\n",
    "\n",
    "def avg_false_positive_rate(cm):\n",
    "    fp_rate = 0\n",
    "    for i in range(0,len(cm)):\n",
    "        fp_rate += false_positive_rate(cm,i)\n",
    "    return fp_rate/len(cm)\n",
    "\n",
    "def avg_false_negative_rate(cm):\n",
    "    fn_rate = 0\n",
    "    for i in range(0,len(cm)):\n",
    "        fn_rate += false_negative_rate(cm,i)\n",
    "    return fn_rate/len(cm)\n",
    "\n",
    "def avg_true_negative_rate(cm):\n",
    "    tn_rate = 0\n",
    "    for i in range(0,len(cm)):\n",
    "        tn_rate += true_negative_rate(cm,i,cm.sum())\n",
    "    return tn_rate/len(cm)\n",
    "\n",
    "def avg_misclassification_rate(cm):\n",
    "    mc_rate = 0\n",
    "    for i in range(0,len(cm)):\n",
    "        mc_rate += misclassification_rate(cm,i)\n",
    "    return mc_rate/len(cm)\n",
    "\n",
    "def matthews(TP,TN,FP,FN):\n",
    "    if ((TP + FP)*(TP + FN)*(TN + FP)*(TN + FN) == 0):\n",
    "        return 0\n",
    "    return (TP*TN - FP*FN)/math.sqrt((TP + FP)*(TP + FN)*(TN + FP)*(TN + FN))\n",
    "\n",
    "def print_table(cm):\n",
    "    print('{:15}'.format('Field'), end='')\n",
    "    print('{:15}'.format('TP'), end='')\n",
    "    print('{:15}'.format('FP'), end='')\n",
    "    print('{:15}'.format('FN'), end='')\n",
    "    print('{:15}'.format('TN'), end='')\n",
    "    print('{:15}'.format('MC Rate'), end='')\n",
    "    print('{:15}'.format('MCC'), end='')\n",
    "    print()\n",
    "    print('---------------------------------------------------------------------------------------------------')\n",
    "    print()\n",
    "    for i in range(0,len(cm)):\n",
    "        print('{:15}'.format(attack_labels[i]), end='')\n",
    "        print('{:15}'.format('{:.5f}'.format(true_positive_rate(cm,i,cm.sum()))), end='')\n",
    "        print('{:15}'.format('{:.5f}'.format(false_positive_rate(cm,i))), end='')\n",
    "        print('{:15}'.format('{:.5f}'.format(false_negative_rate(cm,i))), end='')\n",
    "        print('{:15}'.format('{:.5f}'.format(true_negative_rate(cm,i,cm.sum()))), end='')\n",
    "        print('{:15}'.format('{:.5f}'.format(misclassification_rate(cm,i))), end='')\n",
    "        print('{:15}'.format('{:.5f}'.format(matthews(true_positive_rate(cm,i,cm.sum()),true_negative_rate(cm,i,cm.sum()),\n",
    "                                                      false_positive_rate(cm,i),false_negative_rate(cm,i)))),end='')\n",
    "        print()\n",
    "    print()\n",
    "\n",
    "print_table(cm)\n",
    "print(\"Average true positive rate: \" + str(avg_true_positive_rate(cm)))\n",
    "print(\"Average false positive rate: \" + str(avg_false_positive_rate(cm)))\n",
    "print(\"Average false negative rate: \" + str(avg_false_negative_rate(cm)))\n",
    "print(\"Average true negative rate: \" + str(avg_true_negative_rate(cm)))\n",
    "print(\"Average Misclassification Rate: \" + str(avg_misclassification_rate(cm)))\n",
    "print(\"Matthews Correlation Coefficient: \" + str(matthews(avg_true_positive_rate(cm),avg_true_negative_rate(cm),avg_false_positive_rate(cm),avg_false_negative_rate(cm))))\n",
    "\n",
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [conda env:anaconda3]",
   "language": "python",
   "name": "conda-env-anaconda3-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
